Building an IP Proxy Pool: Crawler Proxy Pool

How to Resolve IP Blocking?

Modify request headers to mimic browser behavior (instead of direct code access).
Use and rotate proxies.
Configure access intervals.
Acquire proxy IP addresses.
Purchase a website for research purposes.

Code Implementation
The following code extracts proxy IP addresses from the table rows (HTML elements) that have class="odd":

from bs4 import BeautifulSoup
import requests
import time

def open_proxy_url(url):
    """Download the page at *url* using a browser-like User-Agent header.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded response body as a string, or ``None`` when the request
        fails (connection error, timeout, or an HTTP error status).
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
    headers = {'User-Agent': user_agent}
    try:
        r = requests.get(url, headers=headers, timeout=20)
        r.raise_for_status()
    except requests.RequestException:
        # Catch only request failures; a bare ``except`` would also hide
        # KeyboardInterrupt and genuine programming errors.
        print(f'Unable to access webpage: {url}')
        return None
    # Let requests guess the encoding from the body so the text decodes
    # correctly even when the server declares the wrong charset.
    r.encoding = r.apparent_encoding
    return r.text

def get_proxy_ip(response):
    """Extract proxy URLs from the rows marked ``class="odd"``.

    Args:
        response: HTML text of the proxy listing page.

    Returns:
        A list of strings of the form ``PROTOCOL://ip:port`` for every row
        whose protocol cell is exactly ``HTTP`` or ``HTTPS``.
    """
    proxy_ip_list = []
    soup = BeautifulSoup(response, 'html.parser')
    for row in soup.select('.odd'):  # Select elements with class="odd"
        cells = row.select('td')
        # Cell index 5 is read below, so skip rows that are too short
        # instead of failing with IndexError.
        if len(cells) < 6:
            continue
        ip = cells[1].text.strip()
        port = cells[2].text.strip()
        protocol = cells[5].text.strip()
        if protocol in ('HTTP', 'HTTPS'):
            proxy_ip_list.append(f'{protocol}://{ip}:{port}')
    return proxy_ip_list

if __name__ == '__main__':
    # Download the listing page, cache it on disk, then parse the cached copy.
    proxy_url = 'https://www.xicidaili.com/'
    proxy_ip_filename = 'proxy_ip.txt'
    text = open_proxy_url(proxy_url)
    if text is None:
        # open_proxy_url already reported the failure; there is nothing to
        # cache or parse, and f.write(None) would raise TypeError.
        raise SystemExit(1)
    # Use an explicit encoding so non-ASCII page content does not depend on
    # the platform's default locale encoding.
    with open(proxy_ip_filename, 'w', encoding='utf-8') as f:
        f.write(text)
    # Read through a context manager so the file handle is closed promptly.
    with open(proxy_ip_filename, 'r', encoding='utf-8') as f:
        text = f.read()
    proxy_ip_list = get_proxy_ip(text)
    print(proxy_ip_list)

Issue with Missing Data
Some proxy IPs are not captured because they lack class="odd". Modify the parser to include all <tr> tags under id="ip_list":

def get_proxy_ip(response):
    """Extract proxy URLs from every row of the ``id="ip_list"`` table.

    Args:
        response: HTML text of the proxy listing page.

    Returns:
        A list of strings of the form ``protocol://ip:port`` (protocol
        lower-cased) for each row with at least 8 cells whose protocol is
        ``http`` or ``https``. The list is empty when the page contains no
        element with ``id="ip_list"``.
    """
    proxy_ip_list = []
    soup = BeautifulSoup(response, 'html.parser')
    table = soup.find(id='ip_list')
    if table is None:
        # The page layout differs (e.g. an error or block page): return no
        # proxies rather than raising AttributeError on None.
        return proxy_ip_list
    for row in table.find_all('tr'):
        cells = row.select('td')
        # Header rows use <th> and therefore have no <td> cells; the
        # length check skips them.
        if len(cells) < 8:
            continue
        ip = cells[1].text.strip()
        port = cells[2].text.strip()
        protocol = cells[5].text.strip().lower()  # Normalize protocol
        if protocol in ('http', 'https'):
            proxy_ip_list.append(f'{protocol}://{ip}:{port}')
    return proxy_ip_list

Using Proxies
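Proxies are passed to requests.get() as a dictionary through the proxies argument; each key is the URL scheme of the outgoing request ('http' or 'https') and each value is the proxy URL to use for that scheme: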

def open_url_using_proxy(url, proxy):
    """Fetch *url* through *proxy* using a browser-like User-Agent header.

    Args:
        url: Address of the page to fetch.
        proxy: Proxy URL such as ``http://1.2.3.4:8080``. Its scheme
            (matched case-insensitively) decides whether it is registered
            for ``https`` or ``http`` requests.

    Returns:
        A ``(text, status_code)`` tuple on success, or ``False`` when the
        request fails (connection error, timeout, or an HTTP error status).
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36'
    headers = {'User-Agent': user_agent}
    # The keys of the proxies dict are the scheme of the request being made;
    # register the proxy only under the scheme it advertises.
    scheme = 'https' if proxy.lower().startswith('https') else 'http'
    proxies = {scheme: proxy}
    try:
        r = requests.get(url, headers=headers, proxies=proxies, timeout=10)
        r.raise_for_status()
    except requests.RequestException:
        # Catch only request failures; a bare ``except`` would also hide
        # KeyboardInterrupt and genuine programming errors.
        print(f'Unable to access webpage: {url}')
        print(f'Invalid proxy IP: {proxy}')
        return False
    r.encoding = r.apparent_encoding
    return (r.text, r.status_code)

Proxy Validation
Verify proxy effectiveness by checking status codes and page titles (e.g., Baidu):

def check_proxy_avaliability(proxy):
    """Report whether *proxy* can successfully fetch the Baidu home page.

    Args:
        proxy: Proxy URL such as ``http://1.2.3.4:8080``.

    Returns:
        ``True`` when the request through the proxy returns status 200 and
        the page title equals ``'Baidu - Search'``; ``False`` otherwise.
    """
    # Use a test URL with the same scheme as the proxy. An HTTPS proxy is
    # registered only for https requests, so probing it with an http URL
    # would bypass the proxy and always look valid.
    scheme = 'https' if proxy.lower().startswith('https') else 'http'
    url = f'{scheme}://www.baidu.com'
    result = open_url_using_proxy(url, proxy)
    if result:
        text, status_code = result
        if status_code == 200:
            soup = BeautifulSoup(text, 'html.parser')
            title_tag = soup.find('title')
            # A proxy may return a page without <title> (e.g. an injected
            # error page); treat that as invalid instead of crashing.
            if title_tag is not None and title_tag.text == 'Baidu - Search':
                print(f'Valid proxy IP: {proxy}')
                return True
    print(f'Invalid proxy IP: {proxy}')
    return False

HTTP vs. HTTPS Proxies

HTTP proxies handle HTTP requests only.
HTTPS proxies handle HTTPS requests.
Example proxy dictionary:

  # Keys are the URL scheme of the outgoing request; values are the proxy
  # URL to route requests of that scheme through.
  proxies = {
      'http': 'http://10.10.1.10:3128',
      'https': 'https://10.10.1.11:1080'
  }

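Use an IP-echo service such as JSON IP to validate proxies: request it through the proxy and check that the IP address it reports is the proxy's address rather than your own.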

References

Requests Proxies Documentation: requests.readthedocs.io
BeautifulSoup Documentation: www.crummy.com/software/BeautifulSoup/bs4/doc/